#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PDF文本提取工具
将PDF文件转换为文本文件，以便AI可以读取
"""

import sys
import os

def extract_pdf_with_pypdf2(pdf_path, output_path):
    """Extract the text of every page with PyPDF2 and write it to a UTF-8 file.

    Each page is preceded by a header line carrying its 1-based page number.
    A page for which PyPDF2 yields no text (``None`` or an empty string) gets
    the same placeholder line that ``extract_pdf_with_pdfplumber`` writes, so
    both backends produce the same layout and a text-less page can no longer
    abort the whole extraction.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the text file to create or overwrite.

    Returns:
        True when the text file was written; False when PyPDF2 is not
        installed, or when reading/writing failed (the error is printed).
    """
    try:
        import PyPDF2
    except ImportError:
        # Library not installed: stay silent so the caller can try a fallback.
        return False

    try:
        chunks = []
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                chunks.append(f"\n--- 第 {page_num} 页 ---\n")
                # Guard against None / "" so concatenation never raises and
                # the reader of the output can see which pages had no text.
                page_text = page.extract_text()
                if page_text:
                    chunks.append(page_text)
                else:
                    chunks.append("[无法提取此页文本，可能是扫描版或图像]\n")

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write("".join(chunks))
        return True
    except Exception as e:
        print(f"PyPDF2提取失败: {e}")
        return False

def extract_pdf_with_pdfplumber(pdf_path, output_path):
    """Extract the text of every page with pdfplumber and write it to a UTF-8 file.

    The original author notes this backend as giving better results for
    Chinese text. Each page is preceded by a header line carrying its 1-based
    page number; a page with no extractable text gets a placeholder line
    instead.

    Args:
        pdf_path: Path of the PDF file to read.
        output_path: Path of the text file to create or overwrite.

    Returns:
        True when the text file was written; False when pdfplumber cannot be
        imported, or when any other error occurred (that error is printed).
    """
    try:
        import pdfplumber

        pieces = []
        with pdfplumber.open(pdf_path) as document:
            for number, page in enumerate(document.pages, start=1):
                pieces.append(f"\n--- 第 {number} 页 ---\n")
                # Falsy result (None or "") means nothing could be extracted.
                pieces.append(
                    page.extract_text()
                    or "[无法提取此页文本，可能是扫描版或图像]\n"
                )

        with open(output_path, 'w', encoding='utf-8') as out_file:
            out_file.write("".join(pieces))
    except ImportError:
        return False
    except Exception as error:
        print(f"pdfplumber提取失败: {error}")
        return False
    return True

def _library_available(module_name):
    """Return True when *module_name* can be located without importing it."""
    import importlib.util

    try:
        return importlib.util.find_spec(module_name) is not None
    except ValueError:
        # find_spec raises ValueError when the module is already present in
        # sys.modules but has no spec; it is importable in that case.
        return True
    except ImportError:
        return False


def main():
    """Command-line entry point: extract the text of a PDF into a text file.

    Reads the PDF path from ``sys.argv[1]`` and an optional output path from
    ``sys.argv[2]``; without the latter the output is written next to the PDF
    as ``<name>_extracted.txt``. pdfplumber is tried first, then PyPDF2.

    Exits with status 1 when the arguments are missing, the PDF does not
    exist, no supported PDF library is installed, or every installed library
    failed to extract the text. Returns normally on success.
    """
    if len(sys.argv) < 2:
        print("用法: python extract_pdf_text.py <PDF文件路径> [输出文件路径]")
        print("\n示例:")
        print('  python extract_pdf_text.py "翻牌游戏.pdf"')
        print('  python extract_pdf_text.py "实验四+翻牌游戏应用.pdf" "output.txt"')
        sys.exit(1)

    pdf_path = sys.argv[1]

    if not os.path.exists(pdf_path):
        print(f"错误: 文件不存在: {pdf_path}")
        sys.exit(1)

    # Decide the output file name.
    if len(sys.argv) >= 3:
        output_path = sys.argv[2]
    else:
        output_path = os.path.splitext(pdf_path)[0] + "_extracted.txt"

    print(f"正在提取PDF文本: {pdf_path}")
    print(f"输出文件: {output_path}")

    # Probe for the libraries up front so that "nothing installed" and
    # "installed but extraction failed" can be reported separately.
    has_pdfplumber = _library_available("pdfplumber")
    has_pypdf2 = _library_available("PyPDF2")

    if not (has_pdfplumber or has_pypdf2):
        print("\n错误: 未找到PDF处理库")
        print("\n请安装以下库之一:")
        print("  方法1（推荐）: pip install pdfplumber")
        print("  方法2: pip install PyPDF2")
        print("\n安装后重新运行此脚本")
        sys.exit(1)

    # Preferred backend first, PyPDF2 as the fallback.
    if has_pdfplumber and extract_pdf_with_pdfplumber(pdf_path, output_path):
        print(f"✓ 成功提取文本到: {output_path}")
        return

    if has_pypdf2 and extract_pdf_with_pypdf2(pdf_path, output_path):
        print(f"✓ 成功提取文本到: {output_path}")
        return

    # A library was available but none produced output; the extractor has
    # already printed the underlying error. Signal failure to the shell.
    print(f"\n错误: 无法从PDF中提取文本: {pdf_path}")
    sys.exit(1)

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()

